#include <isl/aff.h>
#include <isl/ast.h>
#include <isl/isl_ast_private.h>

#include "slave.h"
#include "sw.h"
#include "sw_common.h"
#include "slave_print.h"
#include "slave_group.h"
#include "slave_array_tile.h"
#include "print.h"
#include "util.h"

/* Print an "extern" declaration for the host array "array" on "p".
 *
 * The declaration has the form
 *
 *	extern TYPE NAME[BOUND_0][BOUND_1]...;
 *
 * with one bracketed bound per index dimension.  The bounds are
 * the arguments of array->bound_expr starting at position 1
 * (argument 0 is not printed).
 */
static __isl_give isl_printer *declare_host_array(__isl_take isl_printer *p,
	struct slave_array_info *array)
{
	int dim = 0;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "extern ");
	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, array->name);

	while (dim < array->n_index) {
		isl_ast_expr *extent;

		extent = isl_ast_expr_get_op_arg(array->bound_expr, 1 + dim);
		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_ast_expr(p, extent);
		p = isl_printer_print_str(p, "]");
		isl_ast_expr_free(extent);
		dim++;
	}

	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	return p;
}

/* Print "extern" declarations on "p" for every array of "prog" for which
 * slave_array_requires_device_allocation holds, followed by
 * one line without any content.
 */
static __isl_give isl_printer *declare_host_arrays(__isl_take isl_printer *p,
	struct slave_prog *prog)
{
	int idx;

	for (idx = 0; idx < prog->n_array; ++idx) {
		struct slave_array_info *array = &prog->array[idx];

		if (slave_array_requires_device_allocation(array))
			p = declare_host_array(p, array);
	}

	/* Separate the declarations from whatever is printed next. */
	p = isl_printer_start_line(p);
	p = isl_printer_end_line(p);

	return p;
}

/* Print a declaration for the local (slave) copies of "array" on "p".
 *
 * A single declaration statement of the form
 *
 *	TYPE slave_NAME[_NR][SIZE_0]..., slave_NAME[_NR][SIZE_0]...;
 *
 * is printed, with one declarator per reference group of "array".
 * The group number NR is only appended when the array has more than
 * one reference group.  The sizes are taken from the bounds of
 * the ldm_tile of the group.
 */
static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
	struct slave_local_array_info *array)
{
	int i, j;
	struct slave_array_info *host_array = array->array;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, host_array->type);
	for (j = 0; j < array->n_group; j++) {
		struct slave_array_ref_group *group = array->groups[j];
		struct slave_array_tile *tile = group->ldm_tile;

		if (j != 0)
			p = isl_printer_print_str(p, ",");
		p = isl_printer_print_str(p, " slave_");
		p = isl_printer_print_str(p, host_array->name);
		if (array->n_group > 1) {
			p = isl_printer_print_str(p, "_");
			p = isl_printer_print_int(p, group->nr);
		}
		for (i = 0; i < array->n_index; i++) {
			/* isl_printer_print_val only borrows its argument,
			 * so the size is printed directly; no copy has
			 * to be made (and later released).
			 */
			p = isl_printer_print_str(p, "[");
			p = isl_printer_print_val(p, tile->bound[i].size);
			p = isl_printer_print_str(p, "]");
		}
	}
	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	return p;
}

/* Print declarations on "p" for the local copies of those arrays of "prog"
 * that "kernel" requires as an array argument.
 * Arrays whose local array information has no index dimensions
 * (n_index equal to zero) do not get a declaration.
 */
static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
	struct slave_prog *prog, struct ppcg_kernel *kernel)
{
	int idx;

	for (idx = 0; idx < prog->n_array; ++idx) {
		struct slave_local_array_info *local = &kernel->array[idx];

		if (ppcg_kernel_requires_array_argument_for_sw(kernel, idx) &&
		    local->n_index != 0)
			p = declare_device_array(p, local);
	}

	return p;
}

/* Print code on "p" for releasing the device arrays of "prog".
 *
 * No code is printed at the moment: "p" is handed back unchanged and
 * "prog" is not inspected.
 */
static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
	struct slave_prog *prog)
{
	(void) prog;

	return p;
}

/* Print code to "p" for copying "array" from the host to the device
 * in its entirety.  The printed code has the form
 *
 *	get_reply = 0;
 *	athread_get(PE_MODE, NAME, slave_NAME, SIZE, &get_reply, 0, 0, 0);
 *	while (get_reply != 1);
 *
 * where SIZE is printed by slave_array_info_print_size.
 */
static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
	struct slave_array_info *array)
{
	/* Line 1: clear the reply variable. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "get_reply = 0;");
	p = isl_printer_end_line(p);

	/* Line 2: the athread_get call itself. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "athread_get(PE_MODE, ");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", slave_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");
	p = slave_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", &get_reply, 0, 0, 0);");
	p = isl_printer_end_line(p);

	/* Line 3: spin until the reply variable has been set to 1. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "while (get_reply != 1);");
	p = isl_printer_end_line(p);

	return p;
}

/* Print code to "p" for copying "array" in its entirety in the direction
 * opposite to copy_array_to_device, i.e., from the "slave_"-prefixed copy
 * to the array itself.  The printed code has the form
 *
 *	put_reply = 0;
 *	athread_put(PE_MODE, slave_NAME, NAME, SIZE, &put_reply, 0, 0);
 *	while (put_reply != 1);
 *
 * where SIZE is printed by slave_array_info_print_size.
 */
static __isl_give isl_printer *copy_array_from_device(__isl_take isl_printer *p,
	struct slave_array_info *array)
{
	/* Line 1: clear the reply variable. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "put_reply = 0;");
	p = isl_printer_end_line(p);

	/* Line 2: the athread_put call itself. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "athread_put(PE_MODE, slave_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");
	p = slave_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", &put_reply, 0, 0);");
	p = isl_printer_end_line(p);

	/* Line 3: spin until the reply variable has been set to 1. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "while (put_reply != 1);");
	p = isl_printer_end_line(p);

	return p;
}
															
/* Print a line of the form
 *
 *	argID[POS] = PREFIXNAME;
 *
 * on "p", where "prefix" is either the empty string or "&".
 */
static __isl_give isl_printer *print_host_arg_entry(__isl_take isl_printer *p,
	int id, int pos, const char *prefix, const char *name)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "arg");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, "[");
	p = isl_printer_print_int(p, pos);
	p = isl_printer_print_str(p, "] = ");
	p = isl_printer_print_str(p, prefix);
	p = isl_printer_print_str(p, name);
	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	return p;
}

/* Print host code on "p" that packs the arguments of "kernel"
 * into an array called argID, with ID the identifier of the kernel.
 *
 * First the declaration
 *
 *	unsigned long argID[N];
 *
 * is printed, where N is the total number of arguments.
 * Then one assignment per argument is printed, in the following order
 * - the arrays required by the kernel; an array with at least one index
 *   dimension is stored by name, any other array as "&NAME"
 * - the parameters of kernel->arrays, each stored as "&NAME"
 * - the set dimensions of kernel->space (the host loop iterators),
 *   each stored as "&NAME"
 *
 * If it cannot be determined whether an array is required or if
 * a dimension count cannot be obtained, then nothing further is printed,
 * "p" is freed and NULL is returned.
 */
static __isl_give isl_printer *print_kernel_arguments_host(__isl_take isl_printer *p,
	struct slave_prog *prog, struct ppcg_kernel *kernel)
{
	int i;
	isl_size nparam, n;
	isl_space *space;
	int n_required = 0, pos = 0;

	/* Count the array arguments; bail out before printing anything
	 * if the requirement of some array cannot be determined.
	 */
	for (i = 0; i < prog->n_array; ++i) {
		int required;

		required = ppcg_kernel_requires_array_argument_for_sw(kernel, i);
		if (required < 0)
			return isl_printer_free(p);
		if (required)
			n_required++;
	}

	space = isl_union_set_get_space(kernel->arrays);
	nparam = isl_space_dim(space, isl_dim_param);
	n = isl_space_dim(kernel->space, isl_dim_set);
	if (nparam < 0 || n < 0) {
		isl_space_free(space);
		return isl_printer_free(p);
	}

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "unsigned long arg");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "[");
	p = isl_printer_print_int(p, n_required + nparam + n);
	p = isl_printer_print_str(p, "];");
	p = isl_printer_end_line(p);

	for (i = 0; i < prog->n_array; ++i) {
		struct slave_array_info *array = &prog->array[i];
		int required;

		required = ppcg_kernel_requires_array_argument_for_sw(kernel, i);
		if (required < 0) {
			/* "space" is owned by this function. */
			isl_space_free(space);
			return isl_printer_free(p);
		}
		if (!required)
			continue;

		p = print_host_arg_entry(p, kernel->id, pos++,
					array->n_index ? "" : "&", array->name);
	}

	for (i = 0; i < nparam; ++i) {
		const char *name;

		name = isl_space_get_dim_name(space, isl_dim_param, i);
		p = print_host_arg_entry(p, kernel->id, pos++, "&", name);
	}
	isl_space_free(space);

	for (i = 0; i < n; ++i) {
		const char *name;

		name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
		p = print_host_arg_entry(p, kernel->id, pos++, "&", name);
	}

	return p;
}

/* Print a line of the form
 *
 *	TYPE* NAME = argID[POS];
 *
 * on "p".
 */
static __isl_give isl_printer *print_slave_arg_pointer(
	__isl_take isl_printer *p, const char *type, const char *name,
	int id, int pos)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, "* ");
	p = isl_printer_print_str(p, name);
	p = isl_printer_print_str(p, " = arg");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, "[");
	p = isl_printer_print_int(p, pos);
	p = isl_printer_print_str(p, "];");
	p = isl_printer_end_line(p);

	return p;
}

/* Print a line of the form
 *
 *	TYPE PREFIXNAME = *(TYPE *)argID[POS];
 *
 * on "p", where "prefix" is either the empty string or "slave_".
 */
static __isl_give isl_printer *print_slave_arg_value(
	__isl_take isl_printer *p, const char *type, const char *prefix,
	const char *name, int id, int pos)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, prefix);
	p = isl_printer_print_str(p, name);
	p = isl_printer_print_str(p, " = *(");
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, " *)arg");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, "[");
	p = isl_printer_print_int(p, pos);
	p = isl_printer_print_str(p, "];");
	p = isl_printer_end_line(p);

	return p;
}

/* Print slave code on "p" that unpacks the arguments of "kernel"
 * from the array argID, with ID the identifier of the kernel.
 *
 * The entries are read in the order in which
 * print_kernel_arguments_host stores them:
 * - the arrays required by the kernel; an array with at least one index
 *   dimension is declared as "TYPE* NAME", a read-only scalar as
 *   "TYPE NAME" and any other scalar as "TYPE slave_NAME",
 *   the latter two being initialized by dereferencing the entry
 * - the parameters of kernel->arrays, each declared as "int* NAME"
 * - the set dimensions of kernel->space, each declared with the
 *   AST iterator type and initialized by dereferencing the entry
 *
 * If it cannot be determined whether an array is required or if
 * a dimension count cannot be obtained, then "p" is freed and
 * NULL is returned.
 */
static __isl_give isl_printer *print_kernel_arguments_slave(__isl_take isl_printer *p,
	struct slave_prog *prog, struct ppcg_kernel *kernel)
{
	int i;
	isl_size nparam, n;
	isl_space *space;
	const char *type;
	int pos = 0;

	space = isl_union_set_get_space(kernel->arrays);
	nparam = isl_space_dim(space, isl_dim_param);
	n = isl_space_dim(kernel->space, isl_dim_set);
	if (nparam < 0 || n < 0) {
		isl_space_free(space);
		return isl_printer_free(p);
	}

	for (i = 0; i < prog->n_array; ++i) {
		struct slave_array_info *array = &prog->array[i];
		int required;

		required = ppcg_kernel_requires_array_argument_for_sw(kernel, i);
		if (required < 0) {
			/* "space" is owned by this function. */
			isl_space_free(space);
			return isl_printer_free(p);
		}
		if (!required)
			continue;

		if (array->n_index)
			p = print_slave_arg_pointer(p, array->type,
					array->name, kernel->id, pos);
		else if (array->read_only_scalar)
			p = print_slave_arg_value(p, array->type, "",
					array->name, kernel->id, pos);
		else
			p = print_slave_arg_value(p, array->type, "slave_",
					array->name, kernel->id, pos);
		pos++;
	}

	for (i = 0; i < nparam; ++i) {
		const char *name;

		name = isl_space_get_dim_name(space, isl_dim_param, i);
		p = print_slave_arg_pointer(p, "int", name, kernel->id, pos++);
	}
	isl_space_free(space);

	type = isl_options_get_ast_iterator_type(prog->ctx);
	for (i = 0; i < n; ++i) {
		const char *name;

		name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
		p = print_slave_arg_value(p, type, "", name, kernel->id, pos++);
	}

	return p;
}

/* Print the header of the definition of "kernel" on "p", i.e.,
 *
 *	void kernelID(unsigned long* argID)
 *
 * with ID the identifier of the kernel.  The line is not ended.
 * "prog" is not used.
 */
static __isl_give isl_printer *print_kernel_header_in_c(__isl_take isl_printer *p,
	struct slave_prog *prog, struct ppcg_kernel *kernel)
{
	const int id = kernel->id;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "void kernel");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, "(unsigned long* arg");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, ")");

	return p;
}

/* Print the header of the definition of "kernel", followed by
 * an end of line, to the file sw->slave_c.
 */
static void print_kernel_headers_in_c(struct slave_prog *prog,
	struct ppcg_kernel *kernel, struct sw_info *sw)
{
	isl_printer *printer = isl_printer_to_file(prog->ctx, sw->slave_c);

	printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
	printer = print_kernel_header_in_c(printer, prog, kernel);
	printer = isl_printer_end_line(printer);

	isl_printer_free(printer);
}

/* Print the header of the declaration of "kernel" on "p", i.e.,
 *
 *	extern SLAVE_FUN (kernelID)(unsigned long* argID)
 *
 * with ID the identifier of the kernel.  Neither the terminating
 * semicolon nor the end of the line is printed.
 * "prog" is not used.
 */
static __isl_give isl_printer *print_kernel_header_in_h(__isl_take isl_printer *p,
	struct slave_prog *prog, struct ppcg_kernel *kernel)
{
	const int id = kernel->id;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "extern SLAVE_FUN (kernel");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, ")(unsigned long* arg");
	p = isl_printer_print_int(p, id);
	p = isl_printer_print_str(p, ")");

	return p;
}

/* Print the declaration of "kernel", terminated by a semicolon and
 * an end of line, to the file sw->slave_h.
 */
static void print_kernel_headers_in_h(struct slave_prog *prog,
	struct ppcg_kernel *kernel, struct sw_info *sw)
{
	isl_printer *printer = isl_printer_to_file(prog->ctx, sw->slave_h);

	printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
	printer = print_kernel_header_in_h(printer, prog, kernel);
	printer = isl_printer_print_str(printer, ";");
	printer = isl_printer_end_line(printer);

	isl_printer_free(printer);
}

/* Write an empty string padded to a field width of "indent" to "dst",
 * i.e., "indent" spaces for a positive "indent".
 */
static void print_indent(FILE *dst, int indent)
{
	static const char nothing[] = "";

	fprintf(dst, "%*s", indent, nothing);
}

/* Print a declaration of iterators of type "type" with names "ids" to "out",
 * indented by four positions.
 * Each iterator is initialized to one of the identifiers in "thread_ids",
 * which are assigned in reverse order.  In particular, the last iterator
 * is assigned the first element of "thread_ids".
 * Nothing is printed if "ids" does not contain any elements.
 */
static void print_iterators(FILE *out, const char *type,
	__isl_keep isl_id_list *ids, const char *thread_ids[])
{
	int pos;
	int count = isl_id_list_n_id(ids);
	const char *sep = "";

	if (count <= 0)
		return;

	print_indent(out, 4);
	fprintf(out, "%s ", type);
	for (pos = 0; pos < count; ++pos) {
		isl_id *id = isl_id_list_get_id(ids, pos);

		fprintf(out, "%s%s = %s", sep, isl_id_get_name(id),
			thread_ids[count - 1 - pos]);
		isl_id_free(id);
		/* Every declarator after the first is preceded by a comma. */
		sep = ", ";
	}
	fprintf(out, ";\n");
}

/* Print a declaration of the block identifiers of "kernel" to "out",
 * initializing them from "rid" and "cid".
 *
 * NOTE(review): print_iterators reads one entry of the identifier table
 * per block identifier, while the table has two entries; this presumably
 * relies on kernel->block_ids having at most two elements -- confirm.
 */
static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
{
	const char *thread_ids[] = { "rid", "cid"};
	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
	const char *iterator_type = isl_options_get_ast_iterator_type(ctx);

	print_iterators(out, iterator_type, kernel->block_ids, thread_ids);
}

/* Print a declaration of the pair of RMA reply variables of "reply"
 * on "p", i.e.,
 *
 *	crts_rply_t REPLY_L, REPLY_R;
 */
static __isl_give isl_printer *print_kernel_rma_reply(__isl_take isl_printer *p,
	struct slave_group_rma_reply_info *reply)
{
	p = isl_printer_start_line(p);

	p = isl_printer_print_str(p, "crts_rply_t ");
	p = isl_printer_print_str(p, reply->reply_l);
	p = isl_printer_print_str(p, ", ");
	p = isl_printer_print_str(p, reply->reply_r);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print a declaration of the reply variable "reply" on "p", i.e.,
 *
 *	crts_rply_t NAME;
 */
static __isl_give isl_printer *print_kernel_reply(__isl_take isl_printer *p,
	struct ppcg_kernel_reply *reply)
{
	p = isl_printer_start_line(p);

	p = isl_printer_print_str(p, "crts_rply_t ");
	p = isl_printer_print_str(p, reply->name);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print declarations on "p" for the reply variables of all
 * array reference groups of "kernel".
 *
 * A group that has RMA information and no buffer gets its pair
 * of RMA reply variables declared.  For any other group, the get and/or
 * put reply variables are declared, depending on which of
 * flag_get_reply and flag_put_reply is set.
 */
static __isl_give isl_printer *print_kernel_replys(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int a, g;

	for (a = 0; a < kernel->n_array; ++a) {
		struct slave_local_array_info *local = &kernel->array[a];

		for (g = 0; g < local->n_group; ++g) {
			struct slave_array_ref_group *group = local->groups[g];

			if (group->rma_info && !group->buffer) {
				p = print_kernel_rma_reply(p,
						group->rma_info->rma_reply);
			} else {
				if (group->flag_get_reply)
					p = print_kernel_reply(p,
							group->get_reply);
				if (group->flag_put_reply)
					p = print_kernel_reply(p,
							group->put_reply);
			}
		}
	}

	return p;
}

/* Print a sync statement for an RMA statement.
 *
 * The printed code has the form
 *
 *	if (ITERATOR == ID)
 *	    CRTS_rma_wait_value (&REPLY_L, 1);
 *	else
 *	    CRTS_rma_wait_value (&REPLY_R, 1);
 *
 * where ID is "cid" if the RMA information of the group is marked "row"
 * and "rid" if it is marked "col".
 * Nothing is printed if neither of the two is set.
 */
static __isl_give isl_printer *print_rma_sync(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	struct slave_group_rma_info *info = stmt->u.s.group->rma_info;
	struct slave_group_rma_reply_info *replies = stmt->u.s.rma_reply;
	const char *cond_tail;

	if (info->row)
		cond_tail = " == cid)";
	else if (info->col)
		cond_tail = " == rid)";
	else
		return p;

	/* The condition. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "if (");
	p = isl_printer_print_ast_expr(p, info->iterator);
	p = isl_printer_print_str(p, cond_tail);
	p = isl_printer_end_line(p);

	/* The "then" branch waits on the left reply. */
	p = isl_printer_indent(p, 4);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "CRTS_rma_wait_value (&");
	p = isl_printer_print_str(p, replies->reply_l);
	p = isl_printer_print_str(p, ", 1);");
	p = isl_printer_end_line(p);
	p = isl_printer_indent(p, -4);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "else");
	p = isl_printer_end_line(p);

	/* The "else" branch waits on the right reply. */
	p = isl_printer_indent(p, 4);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "CRTS_rma_wait_value (&");
	p = isl_printer_print_str(p, replies->reply_r);
	p = isl_printer_print_str(p, ", 1);");
	p = isl_printer_end_line(p);
	p = isl_printer_indent(p, -4);

	return p;
}

/* Print an update statement on "p", i.e.,
 *
 *	ITERATOR += INC;
 *
 * with the iterator and the increment taken from "stmt".
 */
static __isl_give isl_printer *print_update(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	p = isl_printer_start_line(p);

	p = isl_printer_print_ast_expr(p, stmt->u.u.iterator);
	p = isl_printer_print_str(p, " += ");
	p = isl_printer_print_val(p, stmt->u.u.inc);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print a sync statement, i.e.,
 *
 *	CRTS_dma_wait_value (&NAME, COUNT);
 *
 * with NAME the name of the reply variable of "stmt".
 * COUNT is "1" if the reply has no reply dimensions.
 * Otherwise, it is the product over all reply dimensions of
 *
 *	((UPPER - LOWER) / INC + 1)
 */
static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	struct ppcg_kernel_reply *reply = stmt->u.s.reply;
	int dim;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "CRTS_dma_wait_value (&");
	p = isl_printer_print_str(p, reply->name);
	p = isl_printer_print_str(p, ", ");

	if (reply->n_reply_dim == 0)
		p = isl_printer_print_str(p, "1");

	for (dim = 0; dim < reply->n_reply_dim; dim++) {
		/* Factors are separated by a multiplication sign. */
		if (dim > 0)
			p = isl_printer_print_str(p, " * ");
		p = isl_printer_print_str(p, "((");
		p = isl_printer_print_ast_expr(p,
				reply->group_reply[dim]->reply_upper);
		p = isl_printer_print_str(p, " - ");
		p = isl_printer_print_ast_expr(p,
				reply->group_reply[dim]->reply_lower);
		p = isl_printer_print_str(p, ") / ");
		p = isl_printer_print_ast_expr(p,
				reply->group_reply[dim]->reply_inc);
		p = isl_printer_print_str(p, " + 1)");
	}

	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	return p;
}

/* Does the body "node" of a for or if node need to be printed
 * inside a block?
 *
 * A block node always needs a block.
 * A degenerate for node is printed as an assignment followed by
 * the body of the loop, so it needs a block as well.
 * An if node with an else branch gets a block to avoid spurious
 * dangling else warnings emitted by some compilers.
 * For a mark node, the child would in principle have to be checked,
 * but for readability a block is printed regardless.
 * In all other cases, the ast_always_print_block option decides.
 */
static int need_block(__isl_keep isl_ast_node *node)
{
	switch (node->type) {
	case isl_ast_node_block:
	case isl_ast_node_mark:
		return 1;
	case isl_ast_node_for:
		if (node->u.f.degenerate)
			return 1;
		break;
	case isl_ast_node_if:
		if (node->u.i.else_node)
			return 1;
		break;
	default:
		break;
	}

	return isl_options_get_ast_always_print_block(
					isl_ast_node_get_ctx(node));
}

/* Print the opening brace of a compound statement on a line of its own and
 * increase the indentation of "p" by four positions.
 */
static __isl_give isl_printer *start_block(__isl_take isl_printer *p)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "{");
	p = isl_printer_end_line(p);

	return isl_printer_indent(p, 4);
}

/* Decrease the indentation of "p" by four positions and print
 * the closing brace of a compound statement on a line of its own.
 */
static __isl_give isl_printer *end_block(__isl_take isl_printer *p)
{
	p = isl_printer_indent(p, -4);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "}");

	return isl_printer_end_line(p);
}

/* Store "aff" in the isl_aff pointer that "user" points to and
 * return isl_stat_error.
 * When used as a callback on the pieces of an isl_pw_aff, this effectively
 * extracts the first (and presumably only) affine expression.
 * "set" is not needed and is released.
 */
static isl_stat extract_single_piece(__isl_take isl_set *set,
	__isl_take isl_aff *aff, void *user)
{
	isl_aff **result = user;

	isl_set_free(set);
	*result = aff;

	return isl_stat_error;
}

/* Print an RMA broadcast for the copy statement attached to "node" on "p".
 *
 * The printed code has the form
 *
 *	{
 *	    //RMA_row
 *	    REPLY_L = 0;
 *	    REPLY_R = 0;
 *	    CRTS_ssync_array();
 *	    if (ITERATOR == cid)
 *	        CRTS_rma_row_ibcast (slave_NAME_NR, slave_NAME_FROM,
 *	            (SIZE0 * SIZE1) * sizeof(TYPE), &REPLY_L, &REPLY_R);
 *	}
 *
 * (with the call printed on a single line) if the RMA information
 * of the group is marked "row".  Otherwise, "col" and "rid" are used
 * in place of "row" and "cid".
 * SIZE0 and SIZE1 are the first two sizes of the simple fixed box hull
 * of the range of the access relation of the group.
 */
static __isl_give isl_printer *print_rma(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	isl_id *annotation;
	struct ppcg_kernel_stmt *stmt;
	struct slave_array_ref_group *group;
	struct slave_group_rma_info *info;
	isl_fixed_box *hull;
	isl_multi_val *extent;
	isl_val *extent0, *extent1;
	const char *marker, *cond_tail, *call;

	annotation = isl_ast_node_get_annotation(node);
	stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);
	group = stmt->u.c.group;

	hull = isl_map_get_range_simple_fixed_box_hull(group->access);
	extent = isl_fixed_box_get_size(hull);
	extent0 = isl_multi_val_get_val(extent, 0);
	extent1 = isl_multi_val_get_val(extent, 1);
	isl_fixed_box_free(hull);
	isl_multi_val_free(extent);

	/* The row and column variants only differ in these three strings. */
	info = group->rma_info;
	if (info->row) {
		marker = "//RMA_row";
		cond_tail = " == cid)";
		call = "CRTS_rma_row_ibcast (slave_";
	} else {
		marker = "//RMA_col";
		cond_tail = " == rid)";
		call = "CRTS_rma_col_ibcast (slave_";
	}

	p = start_block(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, marker);
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, info->rma_reply->reply_l);
	p = isl_printer_print_str(p, " = 0;");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, info->rma_reply->reply_r);
	p = isl_printer_print_str(p, " = 0;");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "CRTS_ssync_array();");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "if (");
	p = isl_printer_print_ast_expr(p, info->iterator);
	p = isl_printer_print_str(p, cond_tail);
	p = isl_printer_end_line(p);

	p = isl_printer_indent(p, 4);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, call);
	p = isl_printer_print_str(p, group->array->name);
	p = isl_printer_print_str(p, "_");
	p = isl_printer_print_int(p, group->nr);
	p = isl_printer_print_str(p, ", slave_");
	p = isl_printer_print_str(p, group->array->name);
	p = isl_printer_print_str(p, "_");
	p = isl_printer_print_int(p, info->from_group);
	p = isl_printer_print_str(p, ", (");
	p = isl_printer_print_val(p, extent0);
	p = isl_printer_print_str(p, " * ");
	p = isl_printer_print_val(p, extent1);
	p = isl_printer_print_str(p, ") * sizeof(");
	p = isl_printer_print_str(p, group->array->type);
	p = isl_printer_print_str(p, "), &");
	p = isl_printer_print_str(p, info->rma_reply->reply_l);
	p = isl_printer_print_str(p, ", &");
	p = isl_printer_print_str(p, info->rma_reply->reply_r);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);
	p = isl_printer_indent(p, -4);

	p = end_block(p);

	isl_val_free(extent0);
	isl_val_free(extent1);

	return p;
}

/* Print "UPPER - LOWER" on "p".
 */
static __isl_give isl_printer *print_dma_span(__isl_take isl_printer *p,
	__isl_keep isl_ast_expr *upper, __isl_keep isl_ast_expr *lower)
{
	p = isl_printer_print_ast_expr(p, upper);
	p = isl_printer_print_str(p, " - ");
	p = isl_printer_print_ast_expr(p, lower);

	return p;
}

/* Print "sizeof(TYPE)" on "p".
 */
static __isl_give isl_printer *print_dma_sizeof(__isl_take isl_printer *p,
	const char *type)
{
	p = isl_printer_print_str(p, "sizeof(");
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, ")");

	return p;
}

/* Print the number of elements transferred by the DMA node "node" on "p".
 *
 * For a node of depth 1, this is "UPPER0 - LOWER0 + 1 ".
 * For a node of depth 2, it is the product of
 * "((UPPER - LOWER) / STRIDE + 1)" over both iterators.
 * Nothing is printed for any other depth.
 */
static __isl_give isl_printer *print_dma_count(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	if (node->u.d.depth == 1) {
		p = print_dma_span(p, node->u.d.upper0, node->u.d.lower0);
		p = isl_printer_print_str(p, " + 1 ");
	} else if (node->u.d.depth == 2) {
		p = isl_printer_print_str(p, "((");
		p = print_dma_span(p, node->u.d.upper0, node->u.d.lower0);
		p = isl_printer_print_str(p, ") / ");
		p = isl_printer_print_ast_expr(p, node->u.d.stride0);
		p = isl_printer_print_str(p, " + 1) * ((");
		p = print_dma_span(p, node->u.d.upper1, node->u.d.lower1);
		p = isl_printer_print_str(p, ") / ");
		p = isl_printer_print_ast_expr(p, node->u.d.stride1);
		p = isl_printer_print_str(p, " + 1)");
	}

	return p;
}

/* Print the two trailing size arguments of a strided DMA call
 * for the DMA node "node" on "p".  "elem_type" is the element type of
 * the array and "gsize" is the constant term of the bound on its
 * last index dimension.
 *
 * For a node of depth 2, the arguments are
 *
 *	(UPPER0 - LOWER0 + 1 ) * sizeof(TYPE),
 *	((STRIDE1 * GSIZE) - (UPPER0 - LOWER0 + 1 )) * sizeof(TYPE)
 *
 * For a node of depth 1 with a stride different from 1, they are
 *
 *	1 * sizeof(TYPE), (STRIDE0 - 1 ) * sizeof(TYPE)
 *
 * In all other cases, "0, 0" is printed.
 */
static __isl_give isl_printer *print_dma_stride_args(
	__isl_take isl_printer *p, __isl_keep isl_ast_node *node,
	const char *elem_type, __isl_keep isl_val *gsize)
{
	if (node->u.d.depth == 2) {
		p = isl_printer_print_str(p, "(");
		p = print_dma_span(p, node->u.d.upper0, node->u.d.lower0);
		p = isl_printer_print_str(p, " + 1 ) * ");
		p = print_dma_sizeof(p, elem_type);
		p = isl_printer_print_str(p, ", ((");
		p = isl_printer_print_ast_expr(p, node->u.d.stride1);
		p = isl_printer_print_str(p, " * ");
		p = isl_printer_print_val(p, gsize);
		p = isl_printer_print_str(p, ") - (");
		p = print_dma_span(p, node->u.d.upper0, node->u.d.lower0);
		p = isl_printer_print_str(p, " + 1 )) * ");
		p = print_dma_sizeof(p, elem_type);
		return p;
	}

	if (node->u.d.depth == 1) {
		isl_val *stride;
		int unit;

		/* isl_ast_expr_get_val returns a new reference,
		 * which needs to be released again.
		 */
		stride = isl_ast_expr_get_val(node->u.d.stride0);
		unit = isl_val_get_num_si(stride) == 1;
		isl_val_free(stride);

		if (!unit) {
			p = isl_printer_print_str(p, "1 * ");
			p = print_dma_sizeof(p, elem_type);
			p = isl_printer_print_str(p, ", (");
			p = isl_printer_print_ast_expr(p, node->u.d.stride0);
			p = isl_printer_print_str(p, " - 1 ) * ");
			p = print_dma_sizeof(p, elem_type);
			return p;
		}
	}

	return isl_printer_print_str(p, "0, 0");
}

/* Print the DMA node "node" on "p".
 *
 * If the reference group of the copy statement attached to "node"
 * has RMA information, then an RMA broadcast is printed instead.
 * Otherwise, the printed code has the form
 *
 *	{
 *	    //DMA for NAME
 *	    REPLY = 0;
 *	    TYPE ITERATOR1 = LOWER1;		(only for depth 2)
 *	    TYPE ITERATOR0 = LOWER0;
 *	    CRTS_dma_iget_stride (&LOCAL, &GLOBAL, (COUNT) * sizeof(T),
 *	        SIZE, STRIDE, &REPLY);
 *	}
 *
 * (with the call printed on a single line) for a read and
 *
 *	    CRTS_dma_iput_stride (&GLOBAL, &LOCAL, ...);
 *
 * for a write, where REPLY is the get reply of the group for a read and
 * the put reply for a write.
 *
 * "print_options" is released.  "user" is not used.
 */
static __isl_give isl_printer *print_dma(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *id, *id0, *id1 = NULL;
	struct ppcg_kernel_stmt *stmt;
	const char *name0, *name1 = NULL;
	const char *type, *elem_type;
	struct slave_local_array_info *array;
	struct slave_array_ref_group *group;
	struct ppcg_kernel_reply *reply;
	isl_val *gsize;
	isl_pw_aff *pa;
	isl_aff *aff = NULL;

	id = isl_ast_node_get_annotation(node);
	stmt = isl_id_get_user(id);
	isl_id_free(id);
	array = stmt->u.c.local_array;
	group = stmt->u.c.group;

	isl_ast_print_options_free(print_options);

	if (group->rma_info)
		return print_rma(p, node);

	/* Extract the constant term of the bound on the last index. */
	pa = isl_multi_pw_aff_get_pw_aff(group->array->bound,
					group->array->n_index - 1);
	isl_pw_aff_foreach_piece(pa, &extract_single_piece, &aff);
	gsize = isl_aff_get_constant_val(aff);
	isl_pw_aff_free(pa);
	isl_aff_free(aff);

	/* The identifiers are kept until the end of the function
	 * such that the names obtained from them remain valid.
	 */
	type = isl_options_get_ast_iterator_type(isl_printer_get_ctx(p));
	id0 = isl_ast_expr_get_id(node->u.d.iterator0);
	name0 = isl_id_get_name(id0);
	if (node->u.d.depth == 2) {
		id1 = isl_ast_expr_get_id(node->u.d.iterator1);
		name1 = isl_id_get_name(id1);
	}

	elem_type = array->array->type;
	reply = stmt->u.c.read ? group->get_reply : group->put_reply;

	p = start_block(p);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "//DMA for ");
	p = isl_printer_print_str(p, group->array->name);
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, reply->name);
	p = isl_printer_print_str(p, " = 0;");
	p = isl_printer_end_line(p);

	if (node->u.d.depth == 2) {
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, type);
		p = isl_printer_print_str(p, " ");
		p = isl_printer_print_str(p, name1);
		p = isl_printer_print_str(p, " = ");
		p = isl_printer_print_ast_expr(p, node->u.d.lower1);
		p = isl_printer_print_str(p, ";");
		p = isl_printer_end_line(p);
	}
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, name0);
	p = isl_printer_print_str(p, " = ");
	p = isl_printer_print_ast_expr(p, node->u.d.lower0);
	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	/* A read and a write only differ in the function that is called,
	 * in the order of the two addresses and in the reply that is used.
	 */
	p = isl_printer_start_line(p);
	if (stmt->u.c.read) {
		p = isl_printer_print_str(p, "CRTS_dma_iget_stride (&");
		p = stmt_print_local_index(p, stmt);
		p = isl_printer_print_str(p, ", &");
		p = stmt_print_global_index(p, stmt);
	} else {
		p = isl_printer_print_str(p, "CRTS_dma_iput_stride (&");
		p = stmt_print_global_index(p, stmt);
		p = isl_printer_print_str(p, ", &");
		p = stmt_print_local_index(p, stmt);
	}
	p = isl_printer_print_str(p, ", (");
	p = print_dma_count(p, node);
	p = isl_printer_print_str(p, ") * ");
	p = print_dma_sizeof(p, elem_type);
	p = isl_printer_print_str(p, ", ");
	p = print_dma_stride_args(p, node, elem_type, gsize);
	p = isl_printer_print_str(p, ", &");
	p = isl_printer_print_str(p, reply->name);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	p = end_block(p);

	isl_id_free(id0);
	isl_id_free(id1);
	isl_val_free(gsize);

	return p;
}

/* Print the call to the compute kernel macro for the user node "node".
 *
 * A "//Compute Kernel" comment line is printed first, followed by
 * a line of the form
 *
 *	GEMM_2X4_32_KERNEL(arg1, arg2, ...);
 *
 * where the arguments are those of the user expression of "node",
 * skipping argument 0.
 *
 * The expression obtained from "node" is owned by this function and
 * is released before returning.  If the number of arguments cannot
 * be determined, the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_compute_kernel(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	isl_ast_expr *expr, *expr_tmp;
	isl_size n;

	expr = isl_ast_node_user_get_expr(node);
	n = isl_ast_expr_get_op_n_arg(expr);
	if (n < 0) {
		isl_ast_expr_free(expr);
		return isl_printer_free(p);
	}

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "//Compute Kernel");
	p = isl_printer_end_line(p);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "GEMM_2X4_32_KERNEL(");
	for (int i = 1; i < n; i++) {
		expr_tmp = isl_ast_expr_op_get_arg(expr, i);
		p = isl_printer_print_ast_expr(p, expr_tmp);
		isl_ast_expr_free(expr_tmp);
		if (i != n - 1)
			p = isl_printer_print_str(p, ", ");
	}
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	/* isl_ast_node_user_get_expr hands over a reference; drop it. */
	isl_ast_expr_free(expr);

	return p;
}

/* This function is called for each user statement in the kernel AST.
 *
 * A node without an annotation is a copy statement.  Its name is taken
 * from the identifier in the first argument of its expression and
 * the user pointer of that identifier is the array to copy.
 * Names starting with "to_device" are printed by copy_array_to_device,
 * all others by copy_array_from_device.
 *
 * An annotation called "compute_kernel" without a user pointer
 * is printed by print_compute_kernel.
 * Any other annotation is expected to carry a ppcg_kernel_stmt,
 * which is dispatched on its type.
 *
 * "print_options" is not used by any of the printers and is released
 * up front so that it is freed on every path.  "user" is unused.
 * If a statement name or statement is missing where one is required,
 * the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *id;
	struct ppcg_kernel_stmt *stmt;
	isl_ast_expr *expr, *arg;
	const char *name;
	struct slave_array_info *array;
	int is_compute;

	(void) user;

	isl_ast_print_options_free(print_options);

	id = isl_ast_node_get_annotation(node);
	if (!id) {
		expr = isl_ast_node_user_get_expr(node);
		arg = isl_ast_expr_get_op_arg(expr, 0);
		id = isl_ast_expr_get_id(arg);
		name = isl_id_get_name(id);
		array = isl_id_get_user(id);
		isl_id_free(id);
		isl_ast_expr_free(arg);
		isl_ast_expr_free(expr);

		/* Same guard as print_device_node: no name, no statement. */
		if (!name)
			return isl_printer_free(p);

		if (!prefixcmp(name, "to_device"))
			return copy_array_to_device(p, array);
		else
			return copy_array_from_device(p, array);
	}

	stmt = isl_id_get_user(id);
	name = isl_id_get_name(id);
	/* Decide before dropping the reference that keeps "name" alive. */
	is_compute = !stmt && name && !strcmp(name, "compute_kernel");
	isl_id_free(id);

	if (is_compute)
		return print_compute_kernel(p, node);

	/* An annotation without a statement cannot be dispatched. */
	if (!stmt)
		return isl_printer_free(p);

	switch (stmt->type) {
		case ppcg_kernel_copy:
			/* The dedicated copy printer is disabled, so copy
			 * statements share the sync path below.
			 * NOTE(review): confirm that reading stmt->u.s is
			 * valid for copy statements.
			 */
			/* fallthrough */
		case ppcg_kernel_sync:
			if (stmt->u.s.rma)
				return print_rma_sync(p, stmt);
			else
				return print_sync(p, stmt);
		case ppcg_kernel_domain:
			return ppcg_kernel_print_domain_for_sw(p, stmt);
		case ppcg_kernel_update:
			return print_update(p, stmt);
	}

	return p;
}

/* Print the slave code for "kernel" to sw->slave_c.
 *
 * A line end is written first, followed by the kernel headers
 * (print_kernel_headers_in_h and print_kernel_headers_in_c),
 * an opening brace and the kernel iterators.
 * The body is then printed with an indentation of 4: the device array
 * declarations, the replies, the kernel arguments, the macros needed
 * by the AST and finally the AST itself, using print_kernel_stmt for
 * user statements and print_dma for DMA nodes.
 * A closing brace ends the kernel.
 */
static void print_kernel(struct slave_prog *prog, struct ppcg_kernel *kernel,
	struct sw_info *sw)
{
	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
	isl_ast_print_options *print_options;
	isl_printer *p;

	/* Write the line end before releasing the printer; calling
	 * isl_printer_end_line on an already freed printer is
	 * a use after free.
	 */
	p = isl_printer_to_file(ctx, sw->slave_c);
	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
	p = isl_printer_end_line(p);
	isl_printer_free(p);

	print_kernel_headers_in_h(prog, kernel, sw);
	print_kernel_headers_in_c(prog, kernel, sw);
	fprintf(sw->slave_c, "{\n");
	print_kernel_iterators(sw->slave_c, kernel);

	p = isl_printer_to_file(ctx, sw->slave_c);
	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
	p = isl_printer_indent(p, 4);

	p = declare_device_arrays(p, prog, kernel);
	p = print_kernel_replys(p, kernel);
	p = print_kernel_arguments_slave(p, prog, kernel);

	p = ppcg_set_macro_names(p);
	p = slave_print_macros(p, kernel->tree);

	print_options = isl_ast_print_options_alloc(ctx);
	print_options = isl_ast_print_options_set_print_user(print_options,
						&print_kernel_stmt, NULL);
	print_options = isl_ast_print_options_set_print_dma(print_options,
						&print_dma, NULL);
	/* isl_ast_node_print takes ownership of "print_options". */
	p = isl_ast_node_print(kernel->tree, p, print_options);
	isl_printer_free(p);

	fprintf(sw->slave_c, "}\n");
}

/* Print the statement that initializes the device for execution of
 * the transformed code, i.e., a single line holding "athread_init();".
 * "prog" is currently not used.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct slave_prog *prog)
{
	static const char init_call[] = "athread_init();";

	(void) prog;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, init_call);
	return isl_printer_end_line(p);
}

/* Print the statement that clears the device after execution of
 * the transformed code, i.e., a single line holding "athread_halt();".
 * "prog" is currently not used.
 */
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
	struct slave_prog *prog)
{
	static const char halt_call[] = "athread_halt();";

	(void) prog;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, halt_call);
	return isl_printer_end_line(p);
}

/* Print a host statement for initializing or clearing the device.
 *
 * The name of the statement is that of the identifier found in
 * the first argument of the user expression of "node".
 * The node for initializing the device is called "init_device" and
 * is printed by init_device.
 * The node for clearing the device is called "clear_device" and
 * is printed by clear_device.
 *
 * Any other node, including one without a name, is not printed here:
 * the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node, struct slave_prog *prog)
{
	isl_ast_expr *call, *callee;
	isl_id *stmt_id;
	const char *stmt_name;

	call = isl_ast_node_user_get_expr(node);
	callee = isl_ast_expr_get_op_arg(call, 0);
	stmt_id = isl_ast_expr_get_id(callee);
	stmt_name = isl_id_get_name(stmt_id);
	isl_id_free(stmt_id);
	isl_ast_expr_free(callee);
	isl_ast_expr_free(call);

	if (stmt_name && !strcmp(stmt_name, "init_device"))
		return init_device(p, prog);
	if (stmt_name && !strcmp(stmt_name, "clear_device"))
		return clear_device(p, prog);

	return isl_printer_free(p);
}

/* User data passed to print_host_user through the print options
 * that are set up in print_host_code.
 *
 * print_host_code initializes this structure positionally
 * as { sw, prog }, so the order of the fields must not change.
 */
struct print_host_user_data {
	/* Forwarded by print_host_user to print_kernel. */
	struct sw_info *sw;
	/* Forwarded by print_host_user to the device node, kernel argument
	 * and kernel printers.
	 */
	struct slave_prog *prog;
};

/* Print the user statement "node" of the host code to "p".
 * "user" points to a print_host_user_data structure.
 *
 * A node without an annotation is handed to print_device_node.
 * A node with an annotation called "user" carries a ppcg_kernel_stmt
 * and is printed by ppcg_kernel_print_domain.
 * Any other annotation carries a ppcg_kernel.  In that case,
 * print the host kernel arguments followed by a block holding
 *
 *	athread_spawn(kernel<id>, arg<id>);
 *	athread_join();
 *
 * and then print the kernel itself through print_kernel.
 */
static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	struct print_host_user_data *host = user;
	struct ppcg_kernel *launch;
	isl_id *annotation;
	void *payload;
	int plain_stmt;

	isl_ast_print_options_free(print_options);

	annotation = isl_ast_node_get_annotation(node);
	if (!annotation)
		return print_device_node(p, node, host->prog);

	/* The meaning of the user pointer depends on the annotation name. */
	plain_stmt = strcmp(isl_id_get_name(annotation), "user") == 0;
	payload = isl_id_get_user(annotation);
	isl_id_free(annotation);

	if (plain_stmt)
		return ppcg_kernel_print_domain(p, payload);

	launch = payload;

	p = print_kernel_arguments_host(p, host->prog, launch);

	p = ppcg_start_block(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "athread_spawn(kernel");
	p = isl_printer_print_int(p, launch->id);
	p = isl_printer_print_str(p, ", arg");
	p = isl_printer_print_int(p, launch->id);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "athread_join();");
	p = isl_printer_end_line(p);

	p = ppcg_end_block(p);

	print_kernel(host->prog, launch, host->sw);

	return p;
}

/* Print the host code described by "tree" to "p".
 *
 * The macros needed by "tree" are printed first.  The tree itself
 * is then printed with print_host_user as the callback for user
 * statements; the callback receives "sw" and "prog" through
 * a print_host_user_data structure that lives on the stack for
 * the duration of the printing.
 */
static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
	struct slave_prog *prog, __isl_keep isl_ast_node *tree,
	struct sw_info *sw)
{
	struct print_host_user_data data;
	isl_ast_print_options *options;

	data.sw = sw;
	data.prog = prog;

	options = isl_ast_print_options_alloc(isl_ast_node_get_ctx(tree));
	options = isl_ast_print_options_set_print_user(options,
						&print_host_user, &data);

	p = slave_print_macros(p, tree);

	return isl_ast_node_print(tree, p, options);
}

/* Given a slave_prog "prog" and the corresponding transformed AST
 * "tree", print the entire SW code to "p".
 * "types" collects the types for which a definition has already
 * been printed.
 * "user" points to the sw_info structure describing the output files.
 *
 * The type definitions are printed to sw->slave_c through a temporary
 * printer.  If that printing fails, "p" is freed and NULL is returned.
 * Otherwise the host code is printed to "p".
 */
static __isl_give isl_printer *print_sw(__isl_take isl_printer *p,
	struct slave_prog *prog, __isl_keep isl_ast_node *tree,
	struct slave_types *types, void *user)
{
	struct sw_info *sw = user;
	isl_printer *kernel;
	int failed;

	kernel = isl_printer_to_file(isl_printer_get_ctx(p), sw->slave_c);
	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
	kernel = slave_print_types(kernel, types, prog);
	/* Record the outcome before freeing; the pointer value must not
	 * be inspected once the printer has been released.
	 */
	failed = !kernel;
	isl_printer_free(kernel);

	if (failed)
		return isl_printer_free(p);

	return print_host_code(p, prog, tree, sw);
}

/* Transform the code in the file called "input" by replacing
 * all scops by corresponding SW code.
 * The output files are opened by sw_open_files based on "input" and
 * closed again after the code has been generated.
 * Return the result of generate_slave.
 */
int generate_sw(isl_ctx *ctx, struct ppcg_options *options,
	const char *input)
{
	struct sw_info files;
	int status;

	sw_open_files(&files, input);
	status = generate_slave(ctx, input, files.host_c, options,
				&print_sw, &files);
	sw_close_files(&files);

	return status;
}
